## This R script computes the gini at metro and national level using the binsmooth package provided by von Hippel, Hunter, and Drown (2017)

## Set the working directory 
setwd("C:\\Users\\IRBXV01\\Dropbox\\Analysis Codes\\Programs")
# setwd("")

## -----------------------------------------------------------------------------
## Load all the required packages (Install if any package is missing)
## -----------------------------------------------------------------------------
library(dplyr)       # for data manipulation
library(haven)       # for reading/writing Stata files
library(stringr)     # for string operations
library(binsmooth)   # for computing smooth CDFs and PDFs from binned data
library(pracma)      # for numerical integration and other mathematical functions
library(parallel)    # for parallel computing (if needed)
library(tidyr)       # for tidying data
library(xtable)      # for exporting tables in LaTeX or HTML format
library(openxlsx)    # for writing Excel files

## -----------------------------------------------------------------------------
## Construct functions to be later used in computation of gini 
## -----------------------------------------------------------------------------

## annualSubset: Subsets the dataset for a given year, then reshapes the data so that 
## each row represents a unique observation with counts spread across income brackets.
annualSubset <- function(dat, var, year, famvar){
  stopifnot(inherits(dat, "data.frame"))
  stopifnot(is.character(var))
  stopifnot(var %in% names(dat))
  stopifnot(is.numeric(year))
  stopifnot(is.character(famvar))
  stopifnot(famvar %in% names(dat))
  
  ## Filter the dataset for the specified year
  dat <- dat[which(dat$year == year), ]
  
  ## Select only the specified variables and family variable count
  dat <- dat[, c(var, famvar)]
  
  ## Reshape the data from long to wide format, using "bracket_no" as column identifiers
  dat <- spread(dat, bracket_no, all_of(famvar))
  dat <- data.frame(dat)
  
  ## Create a new column 'hh' representing the total household count across brackets
  varnames <- names(dat)[grep("X", names(dat))]
  dat$hh <- rowSums(dat[, varnames])
  return(dat)
}

## incomeBins: Generates a table of income bins by grouping the data on year, bracket, and lower bound.
incomeBins <- function(dat){
  stopifnot(inherits(dat, "data.frame"))
  stopifnot(c("year", "bracket_no", "low") %in% names(dat))
  
  ## Group by year, bracket number and lower bound, then count observations
  tab <- dat %>%
    group_by(year, bracket_no, low) %>%
    summarize(n=n(), .groups="drop_last") %>%
    ungroup() %>%
    data.frame()
  return(tab)
}

## incomeDist: Computes the distribution of income by metro area.
## It aggregates both the sum of counts (from income brackets) and the weighted mean income.
incomeDist <- function(dat){
  stopifnot(inherits(dat, "data.frame"))
  stopifnot(c("year", "metro") %in% names(dat))
  stopifnot(c("mean_income", "population") %in% names(dat))
  
  ## Identify income bracket columns
  varnames <- names(dat)[grep("X", names(dat))]
  
  ## Sum counts by year and metro for each income bracket
  metroSums <- dat %>%
    group_by(year, metro) %>%
    summarize(across(all_of(varnames), sum), .groups="drop_last") %>%
    ungroup() %>% data.frame()
  
  ## Compute weighted mean income and total population for each metro area
  metroMeans <- dat %>%
    group_by(year, metro) %>%
    summarize(mean_income = weighted.mean(mean_income, hh, na.rm=TRUE),
              population = sum(population, na.rm=TRUE), .groups="drop_last") %>%
    ungroup() %>% data.frame()
  
  ## Merge the aggregated sums with the computed means/population by metro area
  metro <- merge(metroMeans, metroSums, by=c("year", "metro"))
  return(metro)
}

## smoothCDF: Applies a step function approach to smooth the CDF and compute the gini coefficient.
## It uses the bins from the binsmooth package and then calculates the gini index from the estimated CDF.
smoothCDF <- function(metroMeans, bins){
  stopifnot(inherits(metroMeans, "data.frame"))
  stopifnot(inherits(bins, "data.frame"))
  stopifnot(c("year", "mean_income") %in% names(metroMeans))
  stopifnot(length(unique(metroMeans$year))==1L)
  
  ## Identify the income bracket columns in the metroMeans dataframe
  varnames <- names(metroMeans)[grep("X", names(metroMeans))]
  
  ## Filter the bins data for the relevant year and sort by the lower bound of each bin
  bins <- bins %>%
    filter(year==unique(metroMeans$year)) %>%
    select(low) %>%
    arrange(low)
  
  ## Create a vector of bin edges; note that the top bin is represented by NA (unbounded)
  bins <- c(as.numeric(bins$low), NA)[2:(length(varnames)+1)]
  
  ## Apply the stepbins function to each metro area and calculate the corresponding gini index
  metroMeans$gini <- sapply(1:nrow(metroMeans), function(x){
    sb <- stepbins(bins, unlist(metroMeans[x, varnames]), unlist(metroMeans[x, "mean_income"]))
    gini <- 1 - integral(function(x){(1-sb$stepCDF(x))^2}, 0, sb$E)/integral(function(x){x*sb$stepPDF(x)}, 0, sb$E)
    return(gini)
  })
  return(metroMeans)
}

## giniIndex: Computes the overall weighted and unweighted gini index for a given year.
giniIndex <- function(metroMeans){
  stopifnot(inherits(metroMeans, "data.frame"))
  stopifnot(c("year", "gini", "population") %in% names(metroMeans))
  stopifnot(length(unique(metroMeans$year))==1L)
  
  ## Calculate weighted and unweighted averages of the gini coefficient by year
  giniIndex <- metroMeans %>%
    group_by(year) %>%
    summarize(giniWeighted = weighted.mean(gini, population, na.rm=TRUE),
              giniUnweighted = mean(gini, na.rm=TRUE), .groups="drop_last") %>%
    ungroup() %>% data.frame()
  return(giniIndex)
}

## -----------------------------------------------------------------------------
## Load the required data 
## -----------------------------------------------------------------------------

## Import initial dataset for mean income and population data
base_data <- read_dta("../Output/base_data.dta")
## Extract population information and remove duplicates
basePop <- select(base_data, c("year", "metro", "fips", "population"))
basePop <- basePop[!duplicated(basePop), ]

## Import data set on families with and without kids
kids <- read_dta("../Output/children_data.dta")

## Data set for mean income on families with and without kids
load("../Output/mean_income.Rda", verbose=TRUE)
#write_dta(incomeKids, "../Output/mean_income.dta")

## -----------------------------------------------------------------------------
## Families with kids
## -----------------------------------------------------------------------------
years <- c(1980, 1990, 2000, 2010)

## For families with kids, subset the data for each year and prepare the dataset.
famKidsYear <- lapply(years[1:4], function(x){
  ## Subset children data for the given year
  tab <- annualSubset(kids, c("year", "metro", "fips", "bracket_no"), x, "f_count_ct_kids")
  
  ## Extract corresponding mean income data and rename income column for consistency
  inc <- incomeKids[, c("year", "metro", "fips", "mean_kids_income")]
  names(inc)[4] <- "mean_income"
  
  ## Merge mean income data with population data using fips as a key
  inc <- merge(inc, basePop, by=c("year", "metro", "fips"))
  
  ## Merge the subset data with the income/population data
  tab <- merge(tab, inc, by=c("year", "metro", "fips"), all.x=TRUE)
  
  ## Remove observations where mean income is zero
  if (nrow(tab[which(tab$mean_income==0), ]) != 0){
    tab <- tab[-which(tab$mean_income==0), ]
  }
  return(tab)
})

## Generate the income bins based on the kids data
famBins <- incomeBins(kids)

## Calculate income distribution by metro for families with kids
kidsDist <- lapply(famKidsYear, incomeDist)

## Compute the gini coefficient for each metro area for families with kids
kidsGiniYears <- lapply(kidsDist, function(x) smoothCDF(x, famBins))

## Aggregate the gini indices at the national level (weighted by population)
kidsGini <- do.call(rbind, lapply(kidsGiniYears, giniIndex))

## -----------------------------------------------------------------------------
## Families without kids
## -----------------------------------------------------------------------------
famNoKidsYear <- lapply(years, function(x){
  ## Subset data for families without kids for the given year
  tab <- annualSubset(kids, c("year", "metro", "fips", "bracket_no"), x, "f_count_ct_wokids")
  
  ## Extract corresponding mean income data for families without kids
  inc <- incomeKids[, c("year", "metro", "fips", "mean_no_kids_income")]
  names(inc)[4] <- "mean_income"
  
  ## Merge income data with population data
  inc <- merge(inc, basePop, by=c("year", "metro", "fips"))
  
  ## Merge the subset data with the income/population data
  tab <- merge(tab, inc, by=c("year", "metro", "fips"), all.x=TRUE)
  
  ## Remove observations where mean income is zero
  if (nrow(tab[which(tab$mean_income==0), ]) != 0){
    tab <- tab[-which(tab$mean_income==0), ]
  }
  return(tab)
})

## Compute income distribution by metro for families without kids
nokidsDist <- lapply(famNoKidsYear, incomeDist)

## Compute the gini coefficient for each metro area for families without kids
nokidsGiniYears <- lapply(nokidsDist, function(x) smoothCDF(x, famBins))

## Aggregate the gini indices at the national level (weighted by population)
nokidsGini <- do.call(rbind, lapply(nokidsGiniYears, giniIndex))

## -----------------------------------------------------------------------------
## All Families
## -----------------------------------------------------------------------------
allHHYear <- lapply(years, function(x){
  ## Subset data for all families for the given year
  tab <- annualSubset(base_data, c("year", "metro", "fips", "bracket_no"), x, "f_count_ct")
  
  ## Extract income and population data for all households for the given year
  baseIncome <- base_data[which(base_data$year==x), ]
  baseIncome <- baseIncome[,  c("year", "metro", "fips", "mean_income", "population")]
  baseIncome <- baseIncome[!duplicated(baseIncome), ]
  
  ## Merge the subset data with the income/population data
  tab <- merge(tab, baseIncome, by=c("year", "metro", "fips"), all.x=TRUE)
  
  ## Remove observations where mean income is zero
  if (nrow(tab[which(tab$mean_income==0), ]) != 0){
    tab <- tab[-which(tab$mean_income==0), ]
  }
  return(tab)
})

## Generate income bins for all households based on the base data
allbins <- incomeBins(base_data)

## Compute income distribution by metro for all families
allDist <- lapply(allHHYear, incomeDist)

## Compute the gini coefficient for each metro area for all families
allHHYears <- lapply(allDist, function(x) smoothCDF(x, allbins))

## Aggregate the gini indices at the national level (weighted by population)
allGini <- do.call(rbind, lapply(allHHYears, giniIndex))

## -----------------------------------------------------------------------------
## Merging data: Combine metro-level and national-level gini indices
## -----------------------------------------------------------------------------

## Extract the weighted gini data by year for each group and rename columns accordingly

# For families with kids: rename 'gini' to 'famkids_weighted' and 'famkids_unweighted'
names(kidsGini) <- c("year", "famkids_weighted", "famkids_unweighted")

# For families without kids: rename 'gini' to 'fam_nokids_weighted' and 'fam_nokids_unweighted'
names(nokidsGini) <- c("year", "fam_nokids_weighted", "fam_nokids_unweighted")

# For all families: rename 'gini' to 'all_fam_weighted' and 'all_fam_unweighted'
names(allGini) <- c("year", "all_fam_weighted", "all_fam_unweighted")

## Merge the three national-level gini datasets by year
gini <- list(kids=kidsGini, nokids=nokidsGini, all=allGini)
gini <- Reduce(function(...) merge(..., by="year"), gini)

## Save the national-level gini data to a Stata file
write_dta(gini, "../Output/gini_year_R.dta")


## -----------------------------------------------------------------------------
## Extract the raw gini data by year and metro area (detailed level)
## -----------------------------------------------------------------------------

## Process kidsGiniYears: extract columns and rename "gini" to "famkidsgini"
kids_gini <- do.call(rbind, lapply(kidsGiniYears, function(df) {
  df_subset <- df[, c("year", "metro", "gini")]
  names(df_subset)[names(df_subset) == "gini"] <- "famkidsgini"
  df_subset
}))

## Process nokidsGiniYears: extract columns and rename "gini" to "famnokidsgini"
nokids_gini <- do.call(rbind, lapply(nokidsGiniYears, function(df) {
  df_subset <- df[, c("year", "metro", "gini")]
  names(df_subset)[names(df_subset) == "gini"] <- "famnokidsgini"
  df_subset
}))

## Process allHHYears: extract columns and rename "gini" to "allfamgini"
allHH_gini <- do.call(rbind, lapply(allHHYears, function(df) {
  df_subset <- df[, c("year", "metro", "gini")]
  names(df_subset)[names(df_subset) == "gini"] <- "allfamgini"
  df_subset
}))

## Merge the datasets by year and metro; perform a full outer join to keep all records
metro_gini <- merge(kids_gini, nokids_gini, by = c("year", "metro"), all = TRUE)
metro_gini <- merge(metro_gini, allHH_gini, by = c("year", "metro"), all = TRUE)

## Filter out rows with missing 'allfamgini' values
metro_gini <- metro_gini %>% filter(!is.na(allfamgini)) 

## Merge the metro-level data with the national-level gini data by year
metro_gini <- merge(metro_gini, gini, by="year") 
metro_gini <- metro_gini %>% mutate(ind_msa_gini==1)

## Save the merged metro-level dataset to a Stata file
write_dta(metro_gini, "../Output/metro_gini_all.dta")
